Set-up packages // libraries and prepare for data import:
## Uncomment install.packages and run outside of notebook environment:
## install.packages(c("psych" ,"xtable", "tidyverse", "jsonlite", "likert", "ggplot2", "ploty", "mosaic"))
library(psych)
library(likert)
library(jsonlite)
library(ggplot2)
library(mosaic)
library(plotly)
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
theme_set(theme_classic())
Set-up directories, import and clean data:
rm(list=ls())
setwd("/Users/allieblaising/desktop/bang/R")
getwd()
[1] "/Users/allieblaising/Desktop/bang/R"
dataPath = "../.data"
## Define function to extract survey results:
extractSurvey = function(frame,survey) {
rounds = seq(1,length(frame$results.format[[1]]))
roundResponses = lapply(rounds, function(round) {
getCol = paste("results.",survey,".",round, sep="")
surveyCols = Filter(function(x) grepl(getCol,x),names(frame))
newCols = lapply(surveyCols, function(x) gsub(getCol,paste("results.",survey, sep=""),x) )
surveyFrame = frame[,surveyCols]
if (is.null(newCols)) {return("No newCols")}
names(surveyFrame) = newCols
surveyFrame$id = frame$id
surveyFrame$round = round
surveyFrame$batch = frame$batch
surveyFrame$rooms = frame$rooms
surveyFrame$manipulation = frame$results.manipulationCheck
surveyFrame$blacklist = frame$results.blacklistCheck
return(surveyFrame)
})
return(Reduce(rbind,roundResponses))
}
#Find directory for import (be sure to verify that batch #s align from bangData import and imports below):
batches = dir(dataPath, pattern = "^[0-9]+$" )
completeBatches = Filter(function(batch) {
if (any(dir(paste(dataPath,batch,sep="/")) == "batch.json") && (any(dir(paste(dataPath,batch,sep="/")) == "users.json")) ) {
batchData = read_json(paste(dataPath,batch,"batch.json",sep="/"), simplifyVector = TRUE)
return(any(batchData$batchComplete == TRUE))
}
return(FALSE)
}, batches)
userFiles = lapply(completeBatches, function(batch) {
userFile = read_json(paste(dataPath,batch,"users.json",sep="/"), simplifyVector=TRUE)
return(flatten(userFile, recursive = TRUE))
})
Error in flatten(userFile, recursive = TRUE) :
unused argument (recursive = TRUE)
More cleaning before visualizations:
## Apply extract survey function to extract the right columns and rows for viability survey:
survey = 'viabilityCheck'
frame <- extractSurvey(overlappingFiles, survey)
## Reduce to vertically combine rows in roundwithRooms list:
finalRounds = as.data.frame(Reduce(rbind,roundsWithRooms))
## Subset incomplete cases from viability survey dataframe:
## MSB: complete.cases doesn't work when there aren't NAs, but empty lists, alternatives rather than subsetting where
## blacklist is empty? Doens't work: test <- complete.cases(data)
data <- frame[frame$manipulation!="",]
## Rename room to rooms so that both are retained in future merge:
data <- rename(data, rooms = "rooms")
## Subset incomplete cases for final rounds dataframe:
data2 <- finalRounds[finalRounds$results.manipulationCheck!="", ]
## Select only variables of interest from final rounds:
data2 = data2 %>% select(id, batch, room, bonus, name, friends,
friends_history, results.condition, results.format,
results.manipulation,results.manipulationCheck,results.blacklistCheck, round)
## Convert to compatible data types before merge (this should be simplified)
data2$batch <- unlist(data2$batch)
data$batch <- unlist(data$batch)
data2$round <- unlist(data2$round)
data2$id <- unlist(data2$id)
## Before merge, data and data2 should ahve the same # of observations
## Merge columns by id, round and batch #s:
data <- left_join(data, data2, by=NULL)
Joining, by = c("id", "round", "batch")
## Subset only observations with batch #s in complete batches
allConditions <- data[data$batch %in% completeBatches, ]
Conditionally assign conditions based on treatment and results column:
## Messy, but robust? Verify, verify, verify.
data <- data %>% mutate(
condition = case_when(
results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==1 ~ "A",
results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==2 ~ "B",
results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap",
results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==1 ~ "A",
results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap",
results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==3 ~ "B",
results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==1 ~ "B",
results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==2 ~ "A",
results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
results.condition=='control' & results.format=="c(1, 2, 1)" & round==1 ~ "A",
results.condition=='control' & results.format=="c(1, 2, 1)" & round==2 ~ "B",
results.condition=='control' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap",
results.condition=='control' & results.format=="c(1, 1, 2)" & round==1 ~ "A",
results.condition=='control' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap",
results.condition=='control' & results.format=="c(1, 1, 2)" & round==3 ~ "B",
results.condition=='control' & results.format=="c(2, 1, 1)" & round==1 ~ "B",
results.condition=='control' & results.format=="c(2, 1, 1)" & round==2 ~ "A",
results.condition=='control' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==1 ~ "A" ,
results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==2 ~ "B" ,
results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==3 ~ "C"
))
Set-up for factors for viability questions:
data <- rename(data, "repeatTeam" = results.viabilityCheck.15)
## Remove observations where viability survey wasn't on (remove this line if we want to keep observations with viability off):
data <- na.omit(data)
levels <- c("Strongly Disagree", "Disagree", "Neutral","Agree", "Strongly Agree")
clean <- data %>%
mutate_at(.vars = vars(contains("results.viabilityCheck")), funs(factor(., levels = levels)))
## Viabilitylikert visuals:
## Subset only viabilitySurvey columns and condition column (condition column used to visualize likert responses for conditions)
viabilityLikert <- select(clean, contains("results.viabilityCheck"), "condition", "results.condition")
viabilityLabels = c("1. The members of this team could work for a long time together"
, "2. Most of the members of this team would welcome the opportunity to work as a group again in the future." ,
"3. This team has the capacity for long-term success.",
"4. This team has what it takes to be effective in the future.",
"5. This team would work well together in the future." ,
" 6. This team has positioned itself well for continued success.",
" 7. This team has the ability to perform well in the future. ",
" 8. This team has the ability to function as an ongoing unit." ,
" 9. This team should continue to function as a unit. ",
" 10. This team has the resources to perform well in the future. ",
" 11. This team is well positioned for growth over time. ",
" 12. This team can develop to meet future challenges. ",
" 13. This team has the capacity to sustain itself. ",
" 14. This team has what it takes to endure in future performance episodes.", "condition", "results.condition")
names(viabilityLikert) <- rep(viabilityLabels)
Likert visualizations:
## Likert graph for all viability responses across all conditions (i.e. baseline, control and treatment):
likert.out <- likert(viabilityLikert[-c(15:16)])
plot(likert.out)

## Subset A and Ap (i.e. A prime) responses for treatment group:
# Treatment + A:
treatmentA <- viabilityLikert %>% filter(condition=="A" & results.condition=="treatment")
likert.treatmentA <- likert(treatmentA[-c(15:16)])
plot(likert.treatmentA)

# Treatment + Ap:
treatmentAp <- viabilityLikert %>% filter(condition=="Ap" & results.condition=="treatment")
likert.treatmentAp <- likert(treatmentAp[-c(15:16)])
plot(likert.treatmentAp)

# Treatment + B:
treatmentB <- viabilityLikert %>% filter(condition=="B" & results.condition=="treatment")
likert.treatmentB <- likert(treatmentB[-c(15:16)])
plot(likert.treatmentAp)

## Customize code above to filter on other conditions and treatments of interest.
Exploratory data analysis part #1:
## Create a new dataframe that converts factors to numeric for statistical analyses:
stats <- clean %>% mutate_if(is.factor, as.numeric)
for (i in 1:nrow(stats)) {
stats$sum[i] <- sum(stats[i,1:14])
}
stats$median <- median(stats$sum)
stats$mean <- mean(stats$sum)
## Revalue repeat team: keep plyr b/c some weird R stuff requires library to be called directly (recode values to )
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Yes"="0", "No"="1"))
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Keep this team"="0", "Do not keep this team"="1"))
## Convert to compatible classes for team grouping:
stats$repeatTeam <- as.numeric(stats$repeatTeam)
stats$results.condition <- unlist(stats$results.condition)
stats$results.format <- as.character(stats$results.format)
stats$room <- unlist(stats$room)
## Dplyr to group teams and summarise variables for each group: group_by to find teams and summarise to compact individual results into group level results (i.e. one row of variable results per team)
groupedProportion <- stats %>%
group_by(room, batch, round, condition, results.condition, results.format) %>%
summarise(n=n(), mean=mean(sum), median=median(sum),prop=sum(repeatTeam)/n) %>%
## Filter out all teams with n=1 (i.e. a person in a chat room alone)
filter(n>2)
## Individual proportion:
individualProportion <- stats %>% group_by(round, batch, room) %>%
mutate(sum=sum, mean=mean(sum), median=median(sum), n=n(),prop=sum(repeatTeam)/n) %>% filter(n>1)
## Table showing how many teams we've run in each condition combination:
table(groupedProportion$condition, groupedProportion$results.condition)
control treatment
A 10 12
Ap 10 12
B 7 13
Exploratory data analysis part #2:
## Plot of prop and mean per team with standard error:
ggplot(groupedProportion, aes(x=prop, y=mean)) +
geom_point() +
stat_smooth(method = "lm", col = "red") +
labs(subtitle="",
x="Proportion of answers to Q15: ",
y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.format)

## Unlist for condition for graph:
groupedProportion$results.condition <- unlist(groupedProportion$results.condition)
## Team viability sum and Q15 factor repeat team answers facet_grid by condition, results.condition and results.format (all users)
g <- ggplot(stats, aes(factor(repeatTeam), sum))
g + geom_boxplot(varwidth=T, fill="plum") +
labs(subtitle="Sum of viability measures grouped by team fracture value",
x="Binary fracture value",
y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.format)

g <- ggplot(stats, aes(factor(repeatTeam), sum))
g + geom_boxplot(varwidth=T, fill="plum") +
labs(subtitle="Individual fracture score vs. mean viability:",
x="",
y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.condition)

## groupedProportion: anlaysis on team level
ggplot(groupedProportion, aes(x=prop, y=mean)) +
geom_point() +
stat_smooth(method = "lm", col = "red") + labs(title="Team fracture proportion vs. team mean viability")

## Mean vs. median plot for viability sums per team:
ggplot(groupedProportion, aes(x=mean, y=median)) +
geom_point() +
stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of median vs. mean for team viability sums")

ggplot(data=groupedProportion, aes(groupedProportion$prop)) +
geom_histogram(breaks=seq(0, 1, by=0.20),
col="red",
fill="green",
alpha=.2) + labs(title="Count of team proportions of fracture value responses",
x="", y="Count") + facet_grid(condition ~ results.format)

Probability of fracture across teams and condition combinations:


Probability of fracture given condition in treatment:
tally(~fractureGreaterEqual50 | condition, data = conditionalPropTreatment, format = "proportion")
condition
fractureGreaterEqual50 A Ap
0 0.5405405 0.6756757
1 0.4594595 0.3243243
Boxplot for fracture scores vs. viability score sum (7-70)

Absolute change in fracture proportions between conditions:
Histogram comparisons of fracture ratios across treatment, control and learning effect conditions
ggplot(data=groupedProportionFractureTreatment, aes(fracture)) +
geom_bar(breaks=seq(0, 1, by=0.20),
col="red",
fill="green",
alpha=.2) + labs(title="Fracture ratios: treatment condition") + facet_grid(condition ~ results.condition)
Ignoring unknown parameters: breaks

ggplot(data=groupedProportionFractureControl, aes(fracture)) +
geom_bar(breaks=seq(0, 1, by=0.20),
col="red",
fill="green",
alpha=.2) + labs(title="Fracture ratios: control round") + facet_grid(condition ~ results.condition)
Ignoring unknown parameters: breaks

ggplot(data=groupedProportionFractureLE, aes(fracture)) +
geom_bar(breaks=seq(0, 1, by=0.20),
col="red",
fill="green",
alpha=.2) + labs(title="Fracture ratios: learning effect condition") + facet_grid(results.condition ~ round)
Ignoring unknown parameters: breaks

Graphs for mean distrbution per treatment condition:
## Mean viability distribution graph: treatment condition, A group
barfill <- "#4271AE"
barlines <- "#1F3552"
## Mean for treatment and A group: ##fill-in)
meanDistributionTreatmentA <- ggplot(treatmentAStats, aes(x = sum)) +
geom_histogram(aes(y = ..count..), binwidth = 5,
colour = barlines, fill = barfill) +
scale_x_continuous(name = "Median viability sum",
breaks = seq(0, 100, 20),
limits=c(0, 70)) +
scale_y_continuous(name = "count") +
ggtitle("Frequency of sum of viability scores: N=##fill-in") +
theme_bw() +
theme(axis.line = element_line(size=1, colour = "black"),
panel.grid.major = element_line(colour = "#d3d3d3"),
panel.grid.minor = element_blank(),
panel.border = element_blank(), panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text=element_text(family="Tahoma"),
axis.text.x=element_text(colour="black", size = 9),
axis.text.y=element_text(colour="black", size = 9)) +
geom_vline(xintercept = ##fill-in, size = 1, colour = "#FF3721",
linetype = "dashed")
Error: unexpected '=' in:
" geom_vline(xintercept = ##fill-in, size = 1, colour = "#FF3721",
linetype ="
Chat data:
Conditional probabilities:
CHI Squared:
---
title: "R Notebook"
output: html_notebook
---

## Set-up packages // libraries and prepare for data import: 
```{r}
## Uncomment install.packages and run outside of notebook environment: 
## install.packages(c("psych" ,"xtable", "tidyverse", "jsonlite", "likert", "ggplot2", "ploty", "mosaic"))

library(psych)
library(likert)
library(jsonlite)
library(ggplot2)
library(mosaic)
library(plotly)
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
theme_set(theme_classic())
```

## Set-up directories, import and clean data: 
```{r}
rm(list=ls())
setwd("/Users/allieblaising/desktop/bang/R") 
getwd()
dataPath = "../.data"
## Define function to extract survey results: 
extractSurvey = function(frame,survey) {
  rounds = seq(1,length(frame$results.format[[1]]))
  roundResponses = lapply(rounds, function(round) {
    getCol = paste("results.",survey,".",round, sep="")
    surveyCols = Filter(function(x) grepl(getCol,x),names(frame))
    newCols = lapply(surveyCols, function(x) gsub(getCol,paste("results.",survey, sep=""),x) )
    surveyFrame = frame[,surveyCols]
    if (is.null(newCols)) {return("No newCols")}
    names(surveyFrame) = newCols
    surveyFrame$id = frame$id
    surveyFrame$round = round
    surveyFrame$batch = frame$batch
    surveyFrame$rooms = frame$rooms
    surveyFrame$manipulation = frame$results.manipulationCheck
    surveyFrame$blacklist = frame$results.blacklistCheck
    return(surveyFrame)
  })
  return(Reduce(rbind,roundResponses))
}
#Find directory for import (be sure to verify that batch #s align from bangData import and imports below): 
batches = dir(dataPath, pattern = "^[0-9]+$" )
completeBatches = Filter(function(batch) { 
  if (any(dir(paste(dataPath,batch,sep="/")) == "batch.json") && (any(dir(paste(dataPath,batch,sep="/")) == "users.json")) ) {
    batchData = read_json(paste(dataPath,batch,"batch.json",sep="/"), simplifyVector = TRUE)
    return(any(batchData$batchComplete == TRUE))
  } 
  return(FALSE)
}, batches)
userFiles = lapply(completeBatches, function(batch) {
  userFile = read_json(paste(dataPath,batch,"users.json",sep="/"), simplifyVector=TRUE)
  return(flatten(userFile, recursive = TRUE))
})

## Retroactively find rooms from chat data: 
overlappingFiles = Reduce(function(x,y) merge(x, y, all=TRUE), userFiles)
roundsWithRooms = apply(overlappingFiles,1,function(x) {
  roomsForIndividual = lapply(seq(1,3),function(y) {
    x$room = x$rooms[y]
    x$round = y
    return(x)
  })
  return(Reduce(rbind,roomsForIndividual))
})
library(tidyverse)

```

## More cleaning before visualizations: 
```{r}
## Apply extract survey function to extract the right columns and rows for viability survey: 
survey = 'viabilityCheck'
frame <- extractSurvey(overlappingFiles, survey)
## Reduce to vertically combine rows in roundwithRooms list:  
finalRounds = as.data.frame(Reduce(rbind,roundsWithRooms))
## Subset incomplete cases from viability survey dataframe: 
## MSB: complete.cases doesn't work when there aren't NAs, but empty lists, alternatives rather than subsetting where 
## blacklist is empty? Doens't work: test <- complete.cases(data)
data <- frame[frame$manipulation!="",]
## Rename room to rooms so that both are retained in future merge: 
data <- rename(data, rooms = "rooms")
## Subset incomplete cases for final rounds dataframe: 
data2 <- finalRounds[finalRounds$results.manipulationCheck!="", ]
## Select only variables of interest from final rounds: 
data2 = data2 %>% select(id, batch, room, bonus, name, friends, 
                         friends_history, results.condition, results.format,
                         results.manipulation,results.manipulationCheck,results.blacklistCheck, round)
## Convert to compatible data types before merge (this should be simplified)
data2$batch <- unlist(data2$batch)
data$batch <- unlist(data$batch)
data2$round <- unlist(data2$round)
data2$id <- unlist(data2$id)
## Before merge, data and data2 should ahve the same # of observations
## Merge columns by id, round and batch #s: 
data <- left_join(data, data2, by=NULL)
## Subset only observations with batch #s in complete batches 
allConditions <- data[data$batch %in% completeBatches, ]
```
## Conditionally assign conditions based on treatment and results column: 
```{r}
## Messy, but robust? Verify, verify, verify.  
data <- data %>% mutate(
  condition = case_when(
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==1 ~ "A", 
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==2 ~ "B", 
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==1 ~ "A", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==3 ~ "B", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==1 ~ "B", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==2 ~ "A", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==1 ~ "A", 
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==2 ~ "B", 
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==1 ~ "A", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==3 ~ "B", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==1 ~ "B", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==2 ~ "A", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" , 
    results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==1 ~ "A" ,
    results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==2 ~ "B" ,
    results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==3 ~ "C" 
  )) 
```

## Set-up for factors for viability questions: 
```{r}
data <- rename(data, "repeatTeam" = results.viabilityCheck.15)
## Remove observations where viability survey wasn't on (remove this line if we want to keep observations with viability off): 
data <- na.omit(data)
levels <- c("Strongly Disagree", "Disagree", "Neutral","Agree", "Strongly Agree") 
clean <- data %>% 
  mutate_at(.vars = vars(contains("results.viabilityCheck")), funs(factor(., levels = levels))) 
## Viabilitylikert visuals: 
## Subset only viabilitySurvey columns and condition column (condition column used to visualize likert responses for conditions)
viabilityLikert <- select(clean, contains("results.viabilityCheck"), "condition", "results.condition")
viabilityLabels = c("1. The members of this team could work for a long time together" 
                    , "2. Most of the members of this team would welcome the opportunity to work as a group again in the future." , 
                    "3. This team has the capacity for long-term success.", 
                    "4. This team has what it takes to be effective in the future.", 
                    "5. This team would work well together in the future." , 
                    " 6. This team has positioned itself well for continued success.",  
                    " 7. This team has the ability to perform well in the future. ", 
                    " 8. This team has the ability to function as an ongoing unit." , 
                    " 9. This team should continue to function as a unit. ", 
                    " 10. This team has the resources to perform well in the future. ", 
                    " 11. This team is well positioned for growth over time. ", 
                    " 12. This team can develop to meet future challenges. ", 
                    " 13. This team has the capacity to sustain itself. ", 
                    " 14. This team has what it takes to endure in future performance episodes.", "condition", "results.condition") 
names(viabilityLikert) <- rep(viabilityLabels) 
```

## Likert visualizations: 
```{r}
## Likert graph for all viability responses across all conditions (i.e. baseline, control and treatment): 
likert.out <- likert(viabilityLikert[-c(15:16)]) 
plot(likert.out)
## Subset A and Ap (i.e. A prime) responses for treatment group: 
# Treatment + A: 
treatmentA <- viabilityLikert %>% filter(condition=="A" & results.condition=="treatment") 
likert.treatmentA <- likert(treatmentA[-c(15:16)])
plot(likert.treatmentA)
# Treatment + Ap: 
treatmentAp <- viabilityLikert %>% filter(condition=="Ap" & results.condition=="treatment") 
likert.treatmentAp <- likert(treatmentAp[-c(15:16)])
plot(likert.treatmentAp)
# Treatment + B: 
treatmentB <- viabilityLikert %>% filter(condition=="B" & results.condition=="treatment") 
likert.treatmentB <- likert(treatmentB[-c(15:16)])
plot(likert.treatmentAp)
## Customize code above to filter on other conditions and treatments of interest. 
```

## Exploratory data analysis part #1: 
```{r}
## Create a new dataframe that converts factors to numeric for statistical analyses: 
stats <- clean %>% mutate_if(is.factor, as.numeric)
for (i in 1:nrow(stats)) {
  stats$sum[i] <- sum(stats[i,1:14])                          
} 
stats$median <- median(stats$sum)
stats$mean <- mean(stats$sum)
## Revalue repeat team: keep plyr b/c some weird R stuff requires library to be called directly (recode values to )
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Yes"="0", "No"="1"))
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Keep this team"="0", "Do not keep this team"="1"))
## Convert to compatible classes for team grouping: 
stats$repeatTeam <- as.numeric(stats$repeatTeam)
stats$results.condition <- unlist(stats$results.condition)
stats$results.format <- as.character(stats$results.format)
stats$room <- unlist(stats$room)
## Dplyr to group teams and summarise variables for each group: group_by to find teams and summarise to compact individual results into group level results (i.e. one row of variable results per team)
groupedProportion <- stats %>%
  group_by(room, batch, round, condition, results.condition, results.format) %>% 
  summarise(n=n(), mean=mean(sum), median=median(sum),prop=sum(repeatTeam)/n) %>% 
## Filter out all teams with n=1 (i.e. a person in a chat room alone)
  filter(n>2)
## Individual proportion: 
individualProportion <- stats %>% group_by(round, batch, room) %>% 
  mutate(sum=sum, mean=mean(sum), median=median(sum), n=n(),prop=sum(repeatTeam)/n) %>% filter(n>1)

## Table showing how many teams we've run in each condition combination:  
table(groupedProportion$condition, groupedProportion$results.condition)
```

## Exploratory data analysis part #2: 
```{r}
## Plot of prop and mean per team with standard error: 
ggplot(groupedProportion, aes(x=prop, y=mean)) + 
  geom_point() +
  stat_smooth(method = "lm", col = "red") + 
  labs(subtitle="", 
       x="Proportion of answers to Q15: ",
       y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.format)

## Unlist for condition for graph: 
groupedProportion$results.condition <- unlist(groupedProportion$results.condition)
## Team viability sum and Q15 factor repeat team answers facet_grid by condition, results.condition and results.format (all users) 
g <- ggplot(stats, aes(factor(repeatTeam), sum))
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Sum of viability measures grouped by team fracture value", 
       x="Binary fracture value",
       y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.format)

g <- ggplot(stats, aes(factor(repeatTeam), sum))
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Individual fracture score vs. mean viability:", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.condition)

## groupedProportion: anlaysis on team level 

ggplot(groupedProportion, aes(x=prop, y=mean)) + 
  geom_point() +
  stat_smooth(method = "lm", col = "red") + labs(title="Team fracture proportion vs. team mean viability") 

## Mean vs. median plot for viability sums per team: 
ggplot(groupedProportion, aes(x=mean, y=median)) + 
  geom_point() +
  stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of median vs. mean for team viability sums")

ggplot(data=groupedProportion, aes(groupedProportion$prop)) + 
  geom_histogram(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Count of team proportions of fracture value responses", 
                                  x="", y="Count") + facet_grid(condition ~ results.format)

```

## Probability of fracture across teams and condition combinations: 
```{r}
## Define and initialize cut-off point for fracture: 
groupedProportionFracture <- groupedProportion %>%
  group_by(results.condition, condition) %>% 
  mutate(fractureGreater50 = case_when (prop<=.50 ~ "0", 
                                prop>.50~ "1"), 
         fractureGreaterEqual50 = case_when (prop<.50 ~ "0", 
                                prop>=.50 ~ "1"))

## Fracture proportion scatterplots: 
## Learning effect // baseline: 
## Make new data that includes teams with different round 1 and round 3 teams: 
groupedProportionFractureLE <- groupedProportionFracture %>% filter(results.format=="c(1, 1, 2)" | results.format=="c(2, 1, 1)" & round=="1" | round=="3") 
fractureLE1 <- groupedProportionFractureLE %>% filter(round=="1")
fractureLE2 <- groupedProportionFractureLE %>% filter(round=="3")

## Treatment: 
groupedProportionFractureTreatment <- groupedProportionFracture %>% 
filter(results.condition=="treatment")
treatmentFracture1 <- groupedProportionFractureTreatment %>% filter(condition=="A")
treatmentFracture2 <- groupedProportionFractureTreatment %>% filter(condition=="Ap")
treatmentFracture <- cbind(treatmentFracture1, treatmentFracture2)

ggplot(treatmentFracture, aes(prop, prop1)) +
  geom_point() +
  geom_jitter() + coord_fixed() + xlim(0.0,1.0) + ylim(0.0,1.0) + labs(title="Proportion of team fracture in the first vs. third in treatment", subtitle="No fracture=0, Fracture=1",  
                                                                       x="Fracture the first time a team interacts", 
                                                                       y="Fracture the second time a team interacts, without knowing it") 

## Control:  
groupedProportionFractureControl <- groupedProportionFracture %>% 
  filter(results.condition=="control") 
controlFracture1 <- groupedProportionFractureControl %>% filter(condition=="A")
controlFracture2 <- groupedProportionFractureControl %>% filter(condition=="Ap")
controlFracture <- cbind(controlFracture1, controlFracture2)

ggplot(controlFracture, aes(prop, prop1)) +
  geom_point() +
  geom_jitter() + coord_fixed() + xlim(0.0,1.0) + ylim(0.0,1.0) + labs(title="Proportion of team fracture in the first vs. third in control", subtitle="No fracture=0, Fracture=1",  
                                                                       x="Fracture the first time a team interacts", 
                                                                       y="Fracture the second time a team interacts, without knowing it") 


g <- ggplot(groupedProportionFracture, aes(factor(fractureGreater50), mean))
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team fracture value vs. mean viability score: fracture >0.50", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") 

g <- ggplot(groupedProportionFracture, aes(factor(fractureGreaterEqual50), mean)) 
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team fracture value vs. mean viability score: fracture >=0.50", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") 

g <- ggplot(groupedProportionFractureTreatment, aes(factor(fracture), mean))
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team fracture value vs. mean viability score: fracture >0.50", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") 


## Defining fracture: 

## TREATMENT EFFECT: 
# ## Overall fracture change: 
# sum(ifelse(treatmentFracture1$fracture==treatmentFracture2$fracture,0,1)) 
# ## No fracture to fracture: 
# sum(ifelse(treatmentFracture1$fracture==0 & treatmentFracture2$fracture==1,1,0))
# ## Fracture to no fracture: 
# sum(ifelse(treatmentFracture1$fracture==1 & treatmentFracture2$fracture==0,1,0))
# ## CONTROL EFFECT: 
# ## Overall fracture change: 
# sum(ifelse(controlFracture1$fracture==controlFracture2$fracture,0,1))
# ## No fracture to fracture: 
# sum(ifelse(controlFracture1$fracture==0 & controlFracture2$fracture==1,1,0))
# ## Fracture to no fracture: 
# sum(ifelse(controlFracture1$fracture==1 & controlFracture2$fracture==0,1,0))
# ## LEARNING EFFECT 
# ## Overall fracture change: 
# sum(ifelse(fractureLE1$fracture==fractureLE2$fracture,0,1)) 
# ## No fracture to fracture:
# sum(ifelse(fractureLE1$fracture==0 & fractureLE2$fracture==1,1,0))
# ## Fracture to no fracture 
# sum(ifelse(fractureLE1$fracture==1 & fractureLE2$fracture==0,1,0))
```

## 


## Probability of fracture given condition in treatment: 
```{r}
conditionalProbTreatment <- groupedProportionFractureTreatment %>% filter(condition=="A" | condition=="Ap")
## Greater than 50: 
tally(~fractureGreater50 | condition, data = conditionalProbTreatment, format = "proportion")
## Greater or equal 50: 
tally(~fractureGreaterEqual50 | condition, data = conditionalProbTreatment, format = "proportion")

conditionalProbControl <- groupedProportionControl %>% filter(condition=="A" | condition=="Ap")
## Greater than 50: 
tally(~fractureGreater50 | condition, data = conditionalProbControl, format = "proportion")
## Greater or equal 50: 
tally(~fractureGreaterEqual50 | condition, data = conditionalProbControl, format = "proportion")

```

## Boxplot for fracture scores vs. viability score sum (7-70)
```{r}
groupedProportionFracture$fractureGreater50 <- as.numeric(groupedProportionFracture$fractureGreater50)
groupedProportionFracture$fractureGreaterEqual50 <- as.numeric(groupedProportionFracture$fractureGreaterEqual50)

g <- ggplot(groupedProportionFracture, aes(factor(fractureGreater50), median))
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team median viability score in relation to binary fracture value (greater than 50)", 
       x="Binary fracture value",
       y="Numeric sum of viability measures questions (range: 7-70)") + scale_x_discrete(labels=c("0" = "No Fracture", "1" = "Fracture")) + facet_grid(condition ~ results.condition)

g <- ggplot(groupedProportionFracture, aes(factor(fractureGreater50), median))
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team median viability score in relation to binary fracture value (greater than or equal to 50)", 
       x="Binary fracture value",
       y="Numeric sum of viability measures questions (range: 7-70)") + scale_x_discrete(labels=c("0" = "No Fracture", "1" = "Fracture")) + facet_grid(condition ~ results.condition)

treatmentFracture1$fractureGreater50 <- as.numeric(treatmentFracture1$fractureGreater50) 
treatmentFracture1$fractureGreaterEqual50 <- as.numeric(treatmentFracture1$fractureGreaterEqual50) 
treatmentFracture2$fractureGreater50 <- as.numeric(treatmentFracture2$fractureGreater50) 
treatmentFracture2$fractureGreaterEqual50 <- as.numeric(treatmentFracture2$fractureGreaterEqual50) 
controlFracture1$fractureGreater50 <- as.numeric(controlFracture1$fractureGreater50)
controlFracture1$fractureGreaterEqual50 <- as.numeric(controlFracture1$fractureGreaterEqual50)
controlFracture2$fractureGreater50 <- as.numeric(controlFracture2$fractureGreater50)
controlFracture2$fractureGreaterEqual50 <- as.numeric(controlFracture2$fractureGreaterEqual50)
```

## Absolute change in fracture proportions between conditions: 
```{r}
treatmentFracture$absGreater50 <- abs(treatmentFracture2$fractureGreater50-treatmentFracture1$fractureGreater50)
treatmentFracture$absEqualGreater50 <- abs(treatmentFracture2$fractureGreaterEqual50-treatmentFracture1$fractureGreaterEqual50)

g <- ggplot(treatmentFracture, aes(absGreater50)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(bins=7, 
                   col="pink", 
                   size=.4) + 
  labs(title="Absolute change in team fracture score in treatment condition (fracture >.50", 
       fill = "Round") + 
  xlab(label="Absolute change in team fracture score in treatment condition") + 
  ylab(label="Count") 

g <- ggplot(treatmentFracture, aes(absEqualGreater50)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(bins=7, 
                   col="pink", 
                   size=.4) + 
  labs(title="Absolute change in team fracture score in treatment condition (fracture>=.50)", 
       fill = "Round") + 
  xlab(label="Absolute change in team fracture score in treatment condition") + 
  ylab(label="Count") 

controlFracture$absGreater50 <- abs(controlFracture$fractureGreater50-controlFracture1$fractureGreater50)
g <- ggplot(controlFracture, aes(absGreater50)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(bins=7, 
                   col="pink", 
                   size=.4) + 
  labs(title="Absolute change in team fracture score in control condition (fracture >.50)", 
       fill = "Round") + 
  xlab(label="Absolute change in team fracture score in control condition") + 
  ylab(label="Count") 

controlFracture$absEqualGreater50 <- abs(controlFracture$fractureGreaterEqual50-controlFracture1$fractureGreaterEqual50)
g <- ggplot(controlFracture, aes(absEqualGreater50)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(bins=7, 
                   col="pink", 
                   size=.4) + 
  labs(title="Absolute change in team fracture score in control condition (fracture >=.50)", 
       fill = "Round") + 
  xlab(label="Absolute change in team fracture score in control condition") + 
  ylab(label="Count") 

controlFracture$absGreaterEqual50 <- abs(controlFracture$fractureGreaterEqual50-controlFracture1$fractureGreaterEqual50)
g <- ggplot(controlFracture, aes(abs)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(bins=7, 
                   col="pink", 
                   size=.4) + 
  labs(title="Absolute change in team fracture score in control condition (fracture >=.50)", 
       fill = "Round") + 
  xlab(label="Absolute change in team fracture score in control condition") + 
  ylab(label="Count") 

treatmentFractureEqualGreaterAbsSum <- sum(treatmentFracture$absEqualGreater50) 
treatmentFractureGreaterAbsSum <- sum(treatmentFracture$absGreater50)

## Visualize differences: 

dat <- data.frame(changesPerFractureDefinition = factor(c("Sum of absolute fracture change in treatment >.50","Sum of absolute fracture change in treatment >=.50"), levels=c("Sum of absolute fracture change in treatment >.50","Sum of absolute fracture change in treatment >=.50")),
                  fractureValueChange = c(8, 19))

p <- ggplot(data=dat, aes(x=changesPerFractureDefinition, y=fractureValueChange)) +
  geom_bar(stat="identity")
p <- ggplotly(p)
p

treatmentFractureEqualGreaterAbsSum <- sum(treatmentFracture$absEqualGreater50) 
controlFractureEqualGreaterAbsSum <- sum(controlFracture$absGreaterEqual50)
treatmentFractureGreaterAbsSum <- sum(treatmentFracture$absGreater50)
controlFractureGreaterAbsSum <- sum(controlFracture$absGreater50)

dat <- data.frame(changesPerFractureDefinition = factor(c("Sum of absolute fracture change in treatment >.50","Sum of absolute fracture change in control >.50"), levels=c("Sum of absolute fracture change in treatment >.50","Sum of absolute fracture change in control >.50")),
                  fractureValueChange = c(X, X))

p <- ggplot(data=dat, aes(x=changesPerFractureDefinition, y=fractureValueChange)) +
  geom_bar(stat="identity")
p <- ggplotly(p)
p

dat <- data.frame(changesPerFractureDefinition = factor(c("Sum of absolute fracture change in treatment >=.50","Sum of absolute fracture change in control >=.50"), levels=c("Sum of absolute fracture change in treatment >=.50","Sum of absolute fracture change in control >=.50")),
                  fractureValueChange = c(8, 19))

p <- ggplot(data=dat, aes(x=changesPerFractureDefinition, y=fractureValueChange)) +
  geom_bar(stat="identity")
p <- ggplotly(p)
p
```

```{r}

```




## Histogram comparisons of fracture ratios across treatment, control and learning effect conditions

```{r}
ggplot(data=groupedProportionFractureTreatment, aes(fracture)) + 
  geom_bar(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Fracture ratios: treatment condition") + facet_grid(condition ~ results.condition)

ggplot(data=groupedProportionFractureControl, aes(fracture)) + 
  geom_bar(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Fracture ratios: control round") + facet_grid(condition ~ results.condition)

ggplot(data=groupedProportionFractureLE, aes(fracture)) + 
  geom_bar(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Fracture ratios: learning effect condition") + facet_grid(results.condition ~ round)

```

## Graphs for mean distrbution per treatment condition: 

```{r}
## Mean viability distribution graph: treatment condition, A group 
barfill <- "#4271AE"
barlines <- "#1F3552"
## Mean for treatment and A group: ##fill-in) 
meanDistributionTreatmentA <- ggplot(treatmentAStats, aes(x = sum)) +
  geom_histogram(aes(y = ..count..), binwidth = 5,
                 colour = barlines, fill = barfill) +
  scale_x_continuous(name = "Median viability sum",
                     breaks = seq(0, 100, 20),
                     limits=c(0, 70)) +
  scale_y_continuous(name = "count") +
  ggtitle("Frequency of sum of viability scores: N=##fill-in") +
  theme_bw() +
  theme(axis.line = element_line(size=1, colour = "black"),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(), panel.background = element_blank(),
        plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
        text=element_text(family="Tahoma"),
        axis.text.x=element_text(colour="black", size = 9),
        axis.text.y=element_text(colour="black", size = 9)) +
  geom_vline(xintercept = ##fill-in, size = 1, colour = "#FF3721",
               linetype = "dashed")
meanDistributionTreatmentA

## Mean viability distribution graph: treatment condition, Aprime group 
## Mean for treatment condition, Aprime group: (fill-in)
meanDistributionTreatmentAp <- ggplot(treatmentApStats, aes(x = sum)) +
  geom_histogram(aes(y = ..count..), binwidth = 5,
                 colour = barlines, fill = barfill) +
  scale_x_continuous(name = "Median viability sum",
                     breaks = seq(0, 100, 20),
                     limits=c(0, 70)) +
  scale_y_continuous(name = "count") +
  ggtitle("Frequency of sum of viability scores:N=##fill-in") +
  theme_bw() +
  theme(axis.line = element_line(size=1, colour = "black"),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(), panel.background = element_blank(),
        plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
        text=element_text(family="Tahoma"),
        axis.text.x=element_text(colour="black", size = 9),
        axis.text.y=element_text(colour="black", size = 9)) +
  geom_vline(xintercept = ##fill-in, size = 1, colour = "#FF3721",
               linetype = "dashed")
meanDistributionTreatmentAp

## Mean viability distribution graph: treatment condition, B group 
## Mean for treatment condition, B group: 
barfill <- "#4271AE"
barlines <- "#1F3552"
meanDistributionTreatmentB <- ggplot(treatmentBStats, aes(x = sum)) +
  geom_histogram(aes(y = ..count..), binwidth = 5,
                 colour = barlines, fill = barfill) +
  scale_x_continuous(name = "Median viability sum \n across all teams in masked round",
                     breaks = seq(0, 98, 14),
                     limits=c(7, 70)) +
  scale_y_continuous(name = "count") +
  ggtitle("Frequency of sum of viability scores: masked condition, N=##fill-in") +
  theme_bw() +
  theme(axis.line = element_line(size=1, colour = "black"),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(), panel.background = element_blank(),
        plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
        text=element_text(family="Tahoma"),
        axis.text.x=element_text(colour="black", size = 9),
        axis.text.y=element_text(colour="black", size = 9)) +
  geom_vline(xintercept = ##fill-in, size = 1, colour = "#FF3721",
               linetype = "dashed") 
meanDistributionTreatmentB
```

## Chat data: 
```{r}
chatFiles = lapply(completeBatches, function(batch){
    chatFile = read_json(paste(dataPath,batch,"chats.json",sep="/"), simplifyVector = TRUE)
    return(flatten(chatFile, recursive = TRUE))
  })

allChatFiles <- plyr::ldply(chatFiles, data.frame)
chatFreq <- allChatFiles  %>%
  group_by(round, room, batch) %>% 
  dplyr::summarise(n=n()) %>% 
  filter(n>1, round<=2)

## Filter on round <=2, because round 3 in all but one case is only includes "X has left chat room" 

# Histogram grouped by round: 
g <- ggplot(chatFreq, aes(n)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(aes(fill=factor(round)), 
                   bins=5, 
                   col="black", 
                   size=.1) +   # change number of bins
  labs(title="Histogram of the number of chat lines per team (includes teams with n=1)", 
       fill = "Round") + 
  xlab(label="Number of lines from chat data per team") + 
  ylab(label="Count") 
stats$room <- unlist(stats$room) 
chatFreqStats <- right_join(x=chatFreq, y=stats)


## Export chat data for Mav: 
write.csv(allChatFiles, file = "allChatFiles.csv")

```

## Conditional probabilities: 
```{r}
groupedProportionFractureTreatment %>% 
  group_by(condition) %>% 
  summarise(n=n(), count=count(fracture)) 

# P(F3|F1) 
p1 <-  19 / 34

# P(notF3|F1) 
p2 <-  15 / 34

# P(F3|F1)
p3 <- 24 / 34

## P(notF3|F1)
p4 <- 10 / 34

  # Print out the probabilities
round(c(p1, p2, p3, p4), digits = 2)  
```

## CHI Squared: 
```{r}

```


